使用go.Scatter()函数可以绘制散点图、气泡图和线形图,常用参数如下:
# 3-1 Line and Scatter Plots: one figure showing the three drawing modes
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)  # N evenly spaced points covering [0, 1]
# randn(N) draws N samples from the standard normal distribution
# (mean 0, variance 1); the +5 / -5 offsets shift two of the series.
y0 = np.random.randn(N) + 5
y1 = np.random.randn(N)
y2 = np.random.randn(N) - 5

fig = go.Figure()
# One trace per mode: markers only, lines plus markers, lines only.
# The legend name of each trace is the mode string itself.
for series, draw_mode in ((y0, 'markers'), (y1, 'lines+markers'), (y2, 'lines')):
    fig.add_trace(go.Scatter(x=x, y=series, mode=draw_mode, name=draw_mode))
fig.show()
# 3-2 Styled Scatter Plots: the same three modes with explicit styling
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)  # N evenly spaced points covering [0, 1]
# Standard-normal samples (mean 0, variance 1), two of them shifted by +/-5.
y0 = np.random.randn(N) + 5
y1 = np.random.randn(N)
y2 = np.random.randn(N) - 5

fig = go.Figure(data=[
    # Markers only: set the marker size and fill colour.
    go.Scatter(
        x=x, y=y0, mode='markers', name='markers',
        marker_size=10,
        marker_color='#f79337',
    ),
    # Lines plus markers: additionally give every marker an outline.
    go.Scatter(
        x=x, y=y1, mode='lines+markers', name='lines+markers',
        marker_size=10,
        marker_color='#E15759',
        marker_line_width=1,
        marker_line_color='rgba(0,0,0,0.2)',
    ),
    # Lines only: set the line width and colour.
    go.Scatter(
        x=x, y=y2, mode='lines', name='lines',
        line_width=2,
        line_color='#76B7B2',
    ),
])
fig.show()
# 3-3 Simple Bubble Chart: marker size used as a third dimension
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(50, size=N)  # N random integers below 50

fig = go.Figure()
# Each marker's size comes from the matching entry of z.
fig.add_trace(go.Scatter(x=x, y=y, mode='markers', marker_size=z))
fig.show()
# Extension: bubble sizes can be rescaled with the `sizeref` attribute.
# The recommended formula is
#     sizeref = 2. * max(size array) / (desired maximum marker size ** 2)
# and it is intended to be used together with sizemode='area'.
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(500, size=N)  # N random integers below 500

fig = go.Figure(data=go.Scatter(
    x=x, y=y,
    mode='markers',
    marker=dict(
        size=z,
        # The sizeref formula below scales marker *area*; without this
        # setting the default diameter mode is used and the largest bubble
        # does not come out at the intended 10 px.
        sizemode='area',
        sizeref=2. * max(z) / (10 ** 2),  # largest bubble -> 10 px
    )))
fig.show()
# 3-4 Simple Bubble Chart: marker colour used as an extra dimension
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(50, size=N)  # N random integers below 50

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=x,
    y=y,
    mode='markers',
    text=z,  # text shown for each point
    marker={
        'size': z,                # marker size follows z
        'color': z,               # marker colour follows z as well
        'colorscale': 'Viridis',  # colour scale used to map z to colours
        'showscale': True,        # show the colour bar on the right
        'opacity': 0.6,           # marker opacity
    },
))
fig.show()
# 3-5 Simple Bubble Chart: custom hover labels
import plotly.graph_objects as go
import numpy as np

N = 100
x = np.linspace(0, 1, N)
y = np.random.randn(N)
z = np.random.randint(50, size=N)  # N random integers below 50

# One label per point: x and y rounded to two decimals, plus the size value.
text = [
    'x={0}<br>y={1}<br>size={2}'.format(np.round(xi, 2), np.round(yi, 2), zi)
    for xi, yi, zi in zip(x, y, z)
]

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=x,
    y=y,
    mode='markers',
    text=text,
    hoverinfo='text',  # hover shows only the custom label
    marker={
        'size': z,                # marker size follows z
        'color': z,               # marker colour follows z
        'colorscale': 'Viridis',  # colour scale used to map z to colours
        'showscale': True,        # show the colour bar on the right
        'opacity': 0.6,           # marker opacity
    },
))
fig.show()
导入数据文件'Sample - Superstore.xls',绘制散点图,展示商品子类别(Sub-Category)中'Paper'销售额(Sales)和利润(Profit)的相关关系,用气泡的颜色来展示Discount的取值大小,从而进一步分析这些变量之间的关系。
# Step1. Load the data: read the 'Orders' sheet of 'Sample - Superstore.xls'
# into a DataFrame and inspect its dimensions.
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
# Notebook echo: (number of rows, number of columns)
df.shape
(9994, 21)
# Step2. Filter: keep only the rows whose 'Sub-Category' is 'Paper'
data = df.loc[df['Sub-Category'].eq('Paper')]  # boolean-mask selection
data.shape
(1370, 21)
data.head() # Preview the first rows of the filtered result
| | Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 13 | CA-2018-114412 | 2018-04-15 | 2018-04-20 | Standard Class | AA-10480 | Andrew Allen | Consumer | United States | Concord | ... | 28027.0 | South | OFF-PA-10002365 | Office Supplies | Paper | Xerox 1967 | 15.552 | 3 | 0.2 | 5.4432 |
| 34 | 35 | CA-2018-107727 | 2018-10-19 | 2018-10-23 | Second Class | MA-17560 | Matt Abelman | Home Office | United States | Houston | ... | 77095.0 | Central | OFF-PA-10000249 | Office Supplies | Paper | Easy-staple paper | 29.472 | 3 | 0.2 | 9.9468 |
| 56 | 57 | CA-2017-111682 | 2017-06-17 | 2017-06-18 | First Class | TB-21055 | Ted Butterfield | Consumer | United States | Troy | ... | 12180.0 | East | OFF-PA-10001569 | Office Supplies | Paper | Xerox 232 | 32.400 | 5 | 0.0 | 15.5520 |
| 58 | 59 | CA-2017-111682 | 2017-06-17 | 2017-06-18 | First Class | TB-21055 | Ted Butterfield | Consumer | United States | Troy | ... | 12180.0 | East | OFF-PA-10000587 | Office Supplies | Paper | Array Parchment Paper, Assorted Colors | 14.560 | 2 | 0.0 | 6.9888 |
| 64 | 65 | CA-2016-135545 | 2016-11-24 | 2016-11-30 | Standard Class | KM-16720 | Kunst Miller | Consumer | United States | Los Angeles | ... | 90004.0 | West | OFF-PA-10003892 | Office Supplies | Paper | Xerox 1943 | 146.730 | 3 | 0.0 | 68.9631 |
5 rows × 21 columns
# Step3. Draw the scatter plot: Sales (x) against Profit (y) for 'Paper',
# with the marker colour encoding Discount.
import plotly.graph_objects as go

fig = go.Figure(go.Scatter(
    x=data['Sales'],
    y=data['Profit'],
    mode='markers',
    marker=dict(
        color=data['Discount'],  # colour encodes Discount
        showscale=True,          # colour bar, so the Discount encoding can be read
        size=12,
    ),
    text=data['Discount'],       # Discount value shown on hover
    opacity=0.7,
))
fig.update_layout(
    title='Sales and Profit Distribution of Paper',
    xaxis=dict(title='Sales'),
    yaxis=dict(title='Profit'),
)
fig.show()
# Extension: distinguish the regions by colour by adding one trace per
# Region value (each trace gets its own legend entry).
fig = go.Figure()
for region in data['Region'].unique():
    in_region = data[data['Region'] == region]  # filter once, reuse for x and y
    fig.add_trace(go.Scatter(
        x=in_region['Sales'],
        y=in_region['Profit'],
        mode='markers',
        name=region,
        marker_size=12,
        opacity=0.7,
    ))
fig.update_layout(
    title='Sales and Profit Distribution of Paper by Region',
    xaxis=dict(title='Sales'),
    yaxis=dict(title='Profit'),
)
fig.show()
导入数据文件'Sample - Superstore.xls',绘制气泡图,展示销售额最高的前50名客户的销售额(X轴)和利润(Y轴)的关系,气泡的大小size和颜色color均体现折扣(Discount)这一变量,交互时增加显示的文本text:Customer Name和Discount(如交互所示)。
# Step1. Load the data: read the 'Orders' sheet of 'Sample - Superstore.xls'
# into a DataFrame and inspect its dimensions.
import pandas as pd
df = pd.read_excel('Sample - Superstore.xls',sheet_name='Orders')
# Notebook echo: (number of rows, number of columns)
df.shape
(9994, 21)
# Step2. Per customer: total Sales, mean Discount and total Profit;
# then keep the 50 customers with the highest total Sales.
data = (
    df.groupby('Customer Name')
    .agg({'Sales': 'sum', 'Discount': 'mean', 'Profit': 'sum'})
    .sort_values(by='Sales', ascending=False)
    .head(50)
)
data
| | Sales | Discount | Profit |
|---|---|---|---|
| Customer Name | |||
| Sean Miller | 25043.0500 | 0.246667 | -1980.7393 |
| Tamara Chand | 19052.2180 | 0.116667 | 8981.3239 |
| Raymond Buch | 15117.3390 | 0.094444 | 6976.0959 |
| Tom Ashbrook | 14595.6200 | 0.080000 | 4703.7883 |
| Adrian Barton | 14473.5710 | 0.240000 | 5444.8055 |
| Ken Lonsdale | 14175.2290 | 0.200000 | 806.8550 |
| Sanjit Chand | 14142.3340 | 0.063636 | 5757.4119 |
| Hunter Lopez | 12873.2980 | 0.018182 | 5622.4292 |
| Sanjit Engle | 12209.4380 | 0.110526 | 2650.6769 |
| Christopher Conant | 12129.0720 | 0.281818 | 2177.0493 |
| Todd Sumrall | 11891.7510 | 0.116667 | 2371.7144 |
| Greg Tran | 11820.1200 | 0.100000 | 2163.4269 |
| Becky Martin | 11789.6300 | 0.168750 | -1659.9581 |
| Seth Vernon | 11470.9500 | 0.156250 | 1199.4242 |
| Caroline Jumper | 11164.9740 | 0.188500 | 858.7414 |
| Clay Ludtke | 10880.5460 | 0.114286 | 1933.7831 |
| Maria Etezadi | 10663.7280 | 0.131818 | 1859.4695 |
| Karen Ferguson | 10604.2660 | 0.033333 | 1660.1386 |
| Bill Shonely | 10501.6530 | 0.011111 | 2616.0644 |
| Edward Hooks | 10310.8800 | 0.071875 | 1393.5154 |
| John Lee | 9799.9230 | 0.088235 | 228.9070 |
| Grant Thornton | 9351.2120 | 0.250000 | -4108.6589 |
| Helen Wasserman | 9300.2540 | 0.045000 | 2164.1611 |
| Tom Boeckenhauer | 9133.9900 | 0.070588 | 2798.3689 |
| Peter Fuller | 9062.8640 | 0.121053 | -614.2943 |
| Christopher Martinez | 8954.0200 | 0.120000 | 3899.8904 |
| Justin Deggeller | 8828.0305 | 0.055882 | 1619.5199 |
| Joe Elijah | 8697.8430 | 0.322727 | 1262.2926 |
| Laura Armstrong | 8673.2220 | 0.115385 | 2059.1199 |
| Pete Kriz | 8646.9340 | 0.076000 | 2038.2676 |
| Daniel Raglin | 8350.8680 | 0.153846 | 2869.0760 |
| Natalie Fritzler | 8322.8260 | 0.250000 | -1695.9714 |
| Karen Daniels | 8282.3580 | 0.187500 | 1107.6952 |
| Nick Crebassa | 8241.7390 | 0.136667 | 1314.7580 |
| Harry Marie | 8236.7648 | 0.231000 | 2437.9836 |
| Keith Dawkins | 8181.2560 | 0.087500 | 3038.6254 |
| Sean Braxton | 8057.8910 | 0.241176 | -2082.7451 |
| Zuschuss Carroll | 8025.7070 | 0.254839 | -1032.1490 |
| Joseph Holt | 7954.9980 | 0.085714 | -644.6982 |
| Nora Preis | 7903.1825 | 0.196154 | 631.2282 |
| Anna Häberlin | 7888.2940 | 0.217391 | 1298.0166 |
| Adam Bellavance | 7755.6200 | 0.044444 | 2054.5885 |
| Jim Epp | 7754.9760 | 0.160000 | 1623.4019 |
| Jane Waco | 7721.7140 | 0.071429 | 2173.7094 |
| Lena Creighton | 7663.1260 | 0.156522 | 1288.3469 |
| John Murray | 7625.0760 | 0.184615 | 1574.6164 |
| Jonathan Doherty | 7610.8640 | 0.075000 | 1050.2668 |
| Patrick O'Brill | 7473.8282 | 0.210000 | 38.4757 |
| Maribeth Schnelling | 7443.6900 | 0.160417 | 844.9355 |
| Rick Wilson | 7397.4010 | 0.134783 | 1586.6273 |
# Step3. Draw the bubble chart: Sales (x) against Profit (y) for the top
# customers; both marker size and marker colour encode the mean Discount.
import plotly.graph_objects as go

# One hover label per customer. Iterating over the frame itself (instead of
# a hard-coded range(50) and a positional column lookup) keeps this correct
# when fewer than 50 customers exist or the column order changes.
text = [
    'Customer Name: {0}<br>Discount: {1:.2%}'.format(customer, discount)
    for customer, discount in zip(data.index, data['Discount'])
]

fig = go.Figure(go.Scatter(
    x=data['Sales'],
    y=data['Profit'],
    mode='markers',
    marker=dict(
        color=data['Discount'],
        colorscale='viridis',
        showscale=True,  # colour bar for the Discount encoding
        size=data['Discount'],
        # Rescale the (fractional) Discount values into visible marker sizes.
        # NOTE(review): this formula is the one recommended for
        # sizemode='area'; the default size mode is used here - confirm
        # which rendering is intended.
        sizeref=2*max(data['Discount'])/(10**2)),
    text=text,
))
fig.update_layout(
    title='Top-50 Customers\' Sales and Profit',
    xaxis_title='Sales',
    yaxis_title='Profit'
)
fig.show()
Plotly对时间序列的支持比较友好,既支持字符串格式,又支持日期/时间格式。只要传入的参数x是datetime.datetime对象,或者字符串strings,Plotly会自动识别为时间格式。
如果只想展示部分时间范围内的绘图结果,可以在布局layout中为X轴传入range参数。如果想要恢复默认的时间范围,可以单击界面右上角的Autoscale按钮。
# 3-6 Time Series: three ways of supplying dates for the x axis
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import datetime

# (a) datetime.datetime objects
x0 = [datetime.datetime(year=2020, month=10, day=d) for d in (1, 3, 5)]
y0 = [0, 1, 2]
# (b) date strings, here held in a NumPy array
x1 = np.array(['2020-10-01', '2020-10-03', '2020-10-05'])
y1 = [2, 1, 0]
# (c) a pandas date_range: 31 consecutive days starting 2020-10-01
x2 = pd.date_range('20201001', periods=31)
y2 = np.random.randn(31)

fig = go.Figure(data=[
    go.Scatter(x=x0, y=y0, name='trace_datetime'),
    go.Scatter(x=x1, y=y1, name='trace_string'),
    go.Scatter(x=x2, y=y2, name='trace_daterange'),
])
fig.update_traces(opacity=0.8)
# Restrict the visible x range to the first 7 days of x2. The bounds could
# equally be written out as datetime.datetime(2020, 10, 1) and
# datetime.datetime(2020, 10, 7).
fig.update_layout(xaxis_range=[x2[0], x2[6]])
fig.show()
导入数据文件'Sample - Superstore.xls',绘制时间序列图,展示2018年每天的销售额(Sales)和利润(Profit)。
# Load the data and aggregate it
import pandas as pd

df = pd.read_excel('Sample - Superstore.xls', sheet_name='Orders')
# Total Sales and Profit per order date
data = df[['Order Date', 'Sales', 'Profit']].groupby('Order Date').sum()
data
| | Sales | Profit |
|---|---|---|
| Order Date | ||
| 2015-01-03 | 16.4480 | 5.5512 |
| 2015-01-04 | 288.0600 | -65.9901 |
| 2015-01-05 | 19.5360 | 4.8840 |
| 2015-01-06 | 4407.1000 | 1358.0524 |
| 2015-01-07 | 87.1580 | -71.9621 |
| ... | ... | ... |
| 2018-12-26 | 814.5940 | 61.1202 |
| 2018-12-27 | 177.6360 | -31.9742 |
| 2018-12-28 | 1657.3508 | 253.1188 |
| 2018-12-29 | 2915.5340 | 644.4338 |
| 2018-12-30 | 713.7900 | 101.5365 |
1236 rows × 2 columns
# Indexing and slicing on the date index
for period in ('2018', '2018-02'):  # the whole of 2018, then February 2018
    print(data.loc[period], '\n')
# Label slice between two dates; the printed result contains both end dates.
print(data.loc['2018-12-01':'2018-12-15'])
Sales Profit
Order Date
2018-01-01 1481.8280 -181.4109
2018-01-02 2079.5540 -207.0473
2018-01-03 2070.2720 704.2800
2018-01-06 33.7400 15.5204
2018-01-07 3395.5900 758.7192
... ... ...
2018-12-26 814.5940 61.1202
2018-12-27 177.6360 -31.9742
2018-12-28 1657.3508 253.1188
2018-12-29 2915.5340 644.4338
2018-12-30 713.7900 101.5365
[322 rows x 2 columns]
Sales Profit
Order Date
2018-02-02 913.3540 170.6770
2018-02-03 922.3270 215.5700
2018-02-04 32.6700 8.4942
2018-02-05 2263.0120 74.8820
2018-02-06 904.3540 204.3158
2018-02-09 773.7640 -411.9726
2018-02-10 227.1030 28.1274
2018-02-11 1241.5160 130.1018
2018-02-13 1058.4300 424.3345
2018-02-16 1337.4420 95.9756
2018-02-17 2964.8174 -383.5478
2018-02-18 287.3260 62.4082
2018-02-19 1314.5900 377.0515
2018-02-20 1150.2900 -107.5121
2018-02-21 47.9040 -2.9940
2018-02-23 117.8000 42.3700
2018-02-24 1448.6760 249.3929
2018-02-25 430.4920 -19.3798
2018-02-26 2847.6460 447.3532
2018-02-28 17.6200 8.2242
Sales Profit
Order Date
2018-12-01 5331.178 718.8920
2018-12-02 9951.182 -7.3410
2018-12-03 1403.842 280.7407
2018-12-04 2639.638 -21.9881
2018-12-05 1453.136 447.6235
2018-12-06 10.680 2.8836
2018-12-07 2916.514 -2686.6673
2018-12-08 7643.041 1154.6045
2018-12-09 5470.390 1487.1418
2018-12-10 3873.559 715.5696
2018-12-11 2823.965 -82.4089
2018-12-13 580.936 99.2154
2018-12-14 3897.714 215.2500
2018-12-15 306.888 52.5946
# Time-series plot: daily Sales and Profit for 2018
import plotly.graph_objects as go
import pandas as pd

# Daily totals, restricted to the rows dated in 2018.
data = df.groupby('Order Date')[['Sales', 'Profit']].sum().loc['2018']

fig = go.Figure()
# One line per measure, named after its column.
for column in ('Sales', 'Profit'):
    fig.add_trace(go.Scatter(x=data.index, y=data[column], name=column))
fig.update_traces(opacity=0.8)
fig.update_layout(title='Sales and Profit in 2018')
fig.show()
导入数据文件'Sample - Superstore.xls',绘制时间序列图,展示2018年每个月的销售额(Sales)和利润(Profit)。
对Order Date进行groupby操作后,时间戳是每天(D),如果想要将其转换为每月(M),可以通过重新采样来实现。重新采样是指将时间序列从一个频率转换为另一个频率的过程。将更高频率的数据聚合到低频率被称为向下采样,反之则称为向上采样。Pandas对象配有resample方法,与groupby方法类似,调用resample时需要对数据分组,之后再调用聚合函数。
# Resampling: aggregate the daily 2018 totals into one row per month.
data = df.groupby('Order Date')[['Sales','Profit']].sum()
# resample('M') groups the 2018 rows by month and sum() adds them up; the
# echoed result is labelled with month-end dates.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the installed pandas version.
data = data.loc['2018'].resample('M').sum()
data
| | Sales | Profit |
|---|---|---|
| Order Date | ||
| 2018-01-31 | 43971.3740 | 7140.4391 |
| 2018-02-28 | 20301.1334 | 1613.8720 |
| 2018-03-31 | 58872.3528 | 14751.8915 |
| 2018-04-30 | 36521.5361 | 933.2900 |
| 2018-05-31 | 44261.1102 | 6342.5828 |
| 2018-06-30 | 52981.7257 | 8223.3357 |
| 2018-07-31 | 45264.4160 | 6952.6212 |
| 2018-08-31 | 63120.8880 | 9040.9557 |
| 2018-09-30 | 87866.6520 | 10991.5556 |
| 2018-10-31 | 77776.9232 | 9275.2755 |
| 2018-11-30 | 118447.8250 | 9690.1037 |
| 2018-12-31 | 83829.3188 | 8483.3468 |
data.index.strftime('%Y-%m') # Format the index as 'YYYY-MM' strings
Index(['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
'2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12'],
dtype='object', name='Order Date')
# Time-series plot: monthly Sales and Profit for 2018
month_labels = data.index.strftime('%Y-%m')  # 'YYYY-MM' labels for the x axis

fig = go.Figure()
# One trace per measure, each with its own colour.
for column, colour in (('Sales', '#ff7043'), ('Profit', '#29b6f6')):
    fig.add_trace(go.Scatter(
        x=month_labels,
        y=data[column],
        name=column,
        marker_color=colour,
    ))
fig.update_layout(
    title='Sales and Profit in 2018',
    xaxis_dtick='M1',  # one x-axis tick per month
)
fig.show()